# Direct outputs not eligible for public release

# E.g., to run
# nohup R CMD BATCH --no-save --no-restore '--args outcomes=c("db_w2_wages","per_adult_total_empl_income")' & #nolint

args <- (commandArgs(TRUE))
if (length(args) > 0) {
    for (i in 1:length(args)) {
        eval(parse(text = args[[i]]))
    }
}

# GLOBAL SETTINGS --------------------------------------------------------------

options(
    scipen = 999,
    digits = 16,
    max.print = .Machine$integer.max,
    show.error.locations = TRUE,
    warn = 1
)

RNGkind("L'Ecuyer-CMRG")
seed <- 818675309L
set.seed(seed) # setting main seed

# PACKAGES ---------------------------------------------------------------------
library(data.table)
library(Matrix)
library(zoo)
library(multiwayvcov)
library(lmtest)
library(checkmate)
library(futile.logger)
library(lfe)

library(remotes)
remotes::install_github("setzler/eventStudy/eventStudy")
library(eventStudy) # for DiD-IV

# PACKAGE SETTINGS -------------------------------------------------------------

# data.table
setDTthreads(threads = 1L)
options(datatable.print.class = TRUE, datatable.print.keys = TRUE)
# so that printing the data.table also shows the variable type on top

# BEGIN FILE -------------------------------------------------------------------

source("~/code/0-utility-functions/wald_es.R", local = TRUE)

MakeIntExtInputs <- function(outcome) {

    # Read in lottery panel data
    lottery_panel <- readRDS("~/population-panel-data/lottery_panel_data.rds")

    # For all outcomes, replace with zero if missing
    if (
        (class(lottery_panel[[outcome]]) == "integer") ||
            (class(lottery_panel[[outcome]]) == "numeric")
    ) {
        lottery_panel[is.na(get(outcome)), (outcome) := 0]
    }

    # Similarly, for AGI (used to construct quartiles)
    lottery_panel[is.na(per_adult_adjgross), per_adult_adjgross := 0]

    # Each treated cohort will be restricted to age 21-64 in year 0 and
    # only control units age 21-64 in the same calendar year will be used
    lottery_panel[
        ,
        age_case := as.logical(between(age, 21, 64, incbounds = TRUE))
    ]

    # Just in case
    lottery_panel[is.na(db_w2_wages), db_w2_wages := 0]
    lottery_panel[is.na(has_w2_wages), has_w2_wages := 0]
    lottery_panel[
        is.na(per_adult_total_empl_income),
        per_adult_total_empl_income := 0
    ]
    lottery_panel[is.na(total_employment), total_employment := 0]

    ES_data <- list()
    for (ntile_val in 1:4) {
        ES_data[[ntile_val]] <- ES_clean_data(
            long_data = copy(lottery_panel),
            outcomevar = outcome,
            unit_var = "tin",
            cal_time_var = "tax_yr",
            onset_time_var = "win_yr",
            cluster_vars = "tin",
            omitted_event_time = -2,
            discrete_covars = c("age"),
            control_subset_var = "age_case",
            control_subset_event_time = 0,
            treated_subset_var = "age_case",
            treated_subset_event_time = 0,
            anticipation = 0,
            ntile_var = "per_adult_adjgross",
            ntile_event_time = -2,
            ntiles = 4,
            ntile_var_value = ntile_val,
            ntile_avg = FALSE
        )

        ES_data[[ntile_val]][, income_quartile := as.integer(ntile_val)]
    }
    ntile_val <- NULL
    rm(ntile_val)

    ES_data <- rbindlist(ES_data, use.names = TRUE)

    # Let's focus on event times between +1 and +5 -- rest aren't of interest
    # Unfortunately, catt_specfic_sample isn't a very useful numbering
    # Want to identify a pair:
    # -- ref_onset_time X (ref_event_time != omitted_event_time)
    ES_data[
        ,
        effect_time :=
            sum2(
                ((ref_event_time != omitted_event_time) * ref_event_time),
                na.rm = TRUE
            ),
        by = .(tin, ref_onset_time, catt_specific_sample, income_quartile, treated)
    ]

    ES_data <- ES_data[effect_time > 0 & effect_time < 6]

    # All of the inputs
    ES_data[, ext_marg_var := as.integer(get(outcome) != 0)]

    # auxilliary : cohort size (for weights)
    auxilliary <-
        ES_data[ref_event_time != omitted_event_time & treated == 1,
            .(catt_treated_unique_units = .N),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    # post-win / w+l inputs
    ext_means_post_treated <-
        ES_data[ref_event_time != omitted_event_time & treated == 1,
            .(ext_marg_var_post_treated = mean(ext_marg_var)),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    ext_means_post_control <-
        ES_data[ref_event_time != omitted_event_time & treated == 0,
            .(ext_marg_var_post_control = mean(ext_marg_var)),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    conditional_earnings_post_treated <-
        ES_data[ref_event_time != omitted_event_time & treated == 1 & ext_marg_var == 1,
            .(conditional_earnings_post_treated = mean(get(outcome))),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    conditional_earnings_post_control <-
        ES_data[ref_event_time != omitted_event_time & treated == 0 & ext_marg_var == 1,
            .(conditional_earnings_post_control = mean(get(outcome))),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    post_win_inputs <-
        merge(
            ext_means_post_treated,
            ext_means_post_control,
            by = c(
                "ref_onset_time",
                "effect_time",
                "income_quartile"
            ),
            sort = FALSE
        )

    post_win_inputs <-
        merge(
            post_win_inputs,
            conditional_earnings_post_treated,
            by = c(
                "ref_onset_time",
                "effect_time",
                "income_quartile"
            ),
            sort = FALSE
        )

    post_win_inputs <-
        merge(post_win_inputs,
            conditional_earnings_post_control,
            by = c(
                "ref_onset_time",
                "effect_time",
                "income_quartile"
            ),
            sort = FALSE
        )

    # pre-win / w-s inputs

    ext_means_pre_treated <-
        ES_data[ref_event_time == omitted_event_time & treated == 1,
            .(ext_marg_var_pre_treated = mean(ext_marg_var)),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    ext_means_pre_control <-
        ES_data[ref_event_time == omitted_event_time & treated == 0,
            .(ext_marg_var_pre_control = mean(ext_marg_var)),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    conditional_earnings_pre_treated <-
        ES_data[ref_event_time == omitted_event_time & treated == 1 & ext_marg_var == 1,
            .(conditional_earnings_pre_treated = mean(get(outcome))),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    conditional_earnings_pre_control <-
        ES_data[ref_event_time == omitted_event_time & treated == 0 & ext_marg_var == 1,
            .(conditional_earnings_pre_control = mean(get(outcome))),
            by = .(
                ref_onset_time,
                effect_time,
                income_quartile
            )
        ][order(
            ref_onset_time,
            effect_time,
            income_quartile
        )]

    pre_win_inputs <-
        merge(
            ext_means_pre_treated,
            ext_means_pre_control,
            by = c(
                "ref_onset_time",
                "effect_time",
                "income_quartile"
            ),
            sort = FALSE
        )

    pre_win_inputs <-
        merge(
            pre_win_inputs,
            conditional_earnings_pre_treated,
            by = c(
                "ref_onset_time",
                "effect_time",
                "income_quartile"
            ),
            sort = FALSE
        )

    pre_win_inputs <-
        merge(
            pre_win_inputs,
            conditional_earnings_pre_control,
            by = c(
                "ref_onset_time",
                "effect_time",
                "income_quartile"
            ),
            sort = FALSE
        )

    # Merge everything together wide and calculate alphas and betas

    inputs <-
        merge(
            post_win_inputs,
            pre_win_inputs,
            by = c(
                "ref_onset_time",
                "effect_time",
                "income_quartile"
            ),
            sort = FALSE
        )

    inputs <-
        merge(inputs,
            auxilliary,
            by = c(
                "ref_onset_time",
                "effect_time",
                "income_quartile"
            ),
            sort = FALSE
        )

    saveRDS(
        inputs,
        sprintf(
            "~/estimation-output/intensive_extensive_decomp_inputs_%s.rds",
            outcome
        )
    )

    return(outcome)
}

num_cores <- length(outcomes)
mcmapply(
    MakeIntExtInputs,
    outcomes,
    SIMPLIFY = FALSE,
    mc.silent = FALSE,
    mc.cores = num_cores,
    mc.set.seed = TRUE
)
